####--------------------------------------------####
# Title: Semi-parametric Selection Models for Potentially Non-ignorable 
# Attrition in Panel Studies with Refreshment Samples
# Manuscript ID PA-2013-094
# Y.SI, J.REITER AND S.HILLYGUS
####--------------------------------------------####
###--Compare semi-parametric AN model with Amelia--###
###--Data Simulation and Amelia Imputation--###
####--------------------------------------------####
###Author: YAJUAN SI
###Latest edit date: 03/30/2014
####--------------------------------------------####
library(Amelia)
library(ggplot2)
set.seed(20140220)
####-------------loglinear prob function-----------------------####
logcellpr <- function(X1,X2,X3,X4,X5){
  value <- -2*(X1+X2+X3+X4+X5)+X1*X2+X1*X3+X1*X4+
    X1*X5+X2*X3+X2*X4+X2*X5+
    X3*X4-X3*X5-X4*X5+
    X1*X2*X3-X2*X3*X4+X3*X4*X5+
    X2*X3*X5-2*X1*X4*X5  
  return(value)
}
####-------------loglinear model for first 5 variables-----------------####
logpr <- rep(0,2^5)
index=0
for(i1 in 1:2){
  for (i2 in 1:2){
    for (i3 in 1:2){
      for (i4 in 1:2){
        for (i5 in 1:2){
          index <- index+1
          logpr[index] <- logcellpr(i1-1, i2-1,i3-1,i4-1, i5-1)
        }
      }
    }
  }
}

cellpr <- exp(logpr)/sum(exp(logpr))
####----------------general parameters--------------------####
Re<-100 # #of replications
n<-1000 # total sample size
samplesize<-n
Np<-800 # panel sample size
Nr<-200 # refreshment sample size
####----------------output files--------------------####
# 10 variables
sampledata<-array(0,c(Re,n,10)) 
misdata<-array(0,c(Re,n,10))
samplew<-matrix(0,Re,n)
p.index<-matrix(0,Re,Np)
r.index<-matrix(0,Re,Nr)
####----------------initial table for first 5 variables--------------------####
###
#initial setting, will be masked later
X <- matrix(0,samplesize,5)
for (j in 1:5){
  X[,j] <- rbinom(samplesize,1,0.25+0.1*j)
}

colnames(X)=c("x1","x2","x3","x4","x5")

contigency=table(X[,1],X[,2],X[,3],X[,4],X[,5],dnn = c("x1","x2","x3","x4","x5"))

Xtry <- as.data.frame(contigency)
###
####----------------replication process-----------------------####
for (re in 1:Re){
  set.seed(20140212+re)
  sample.data <- matrix(9,samplesize,10)
###generate first 5 variables
  sample.datafreq<-rmultinom(1,samplesize,cellpr)
  Xtry$Freq<- sample.datafreq
  X <- NULL
  for (j in 1:sample.datafreq[1]){
    X <- rbind(X,as.numeric(Xtry[1,1:5])-1)
  }
  for (i in 2:32){
    if (sample.datafreq[i]>0){
      for (j in (sum(sample.datafreq[1:(i-1)])+1):sum(sample.datafreq[1:i])){  
        X <- rbind(X,as.numeric(Xtry[i,1:5])-1) 
      }
    }
  }

sample.data[,1:5] <- X
#generate the 6th variables
logy6 <- 1.5*sample.data[,1]+0.2*sample.data[,2]-0.2*sample.data[,3]+0.1*sample.data[,4]+0.2*sample.data[,5]-1.2*sample.data[,1]*sample.data[,2]

sample.data[,6] <- rbinom(samplesize,1,1/(1+exp(-logy6)))
#generate the 7th variables
logy7 <- -0.1*sample.data[,1]+1*sample.data[,2]+0.2*sample.data[,3]+0.1*sample.data[,4]+0.1*sample.data[,5]-
  sample.data[,6]-0.5*sample.data[,2]*sample.data[,3]

sample.data[,7] <- rbinom(samplesize,1,1/(1+exp(-logy7)))
#generate the 8th variables
logy8 <- 1.2*sample.data[,3]+0.2*sample.data[,5]+0.1*sample.data[,6]-0.2*sample.data[,7]-
  0.5*sample.data[,3]*sample.data[,6]+ 0.2*sample.data[,6]*sample.data[,7]+
  1.1*sample.data[,3]*sample.data[,7]-0.4*sample.data[,3]*sample.data[,6]*sample.data[,7]

sample.data[,8] <- rbinom(samplesize,1,1/(1+exp(-logy8)))
#generate the 9th variables
logy9 <- 1*sample.data[,4]-0.2*sample.data[,2]+0.1*sample.data[,7]-0.3*sample.data[,8]-1.5*sample.data[,4]*sample.data[,8]

sample.data[,9] <- rbinom(samplesize,1,1/(1+exp(-logy9)))
#generate the 10th variables
logy10 <- -0.5*sample.data[,4]+1*sample.data[,5]+0.2*sample.data[,8]+0.2*sample.data[,9]-
  1.5*sample.data[,5]*sample.data[,8]+1*sample.data[,8]*sample.data[,9]

sample.data[,10] <- rbinom(samplesize,1,1/(1+exp(-logy10)))
#simulate W
w<-rep(1,samplesize)
w_u<- 2.5*sample.data[,10] - 1*sample.data[,7] + 0.5 * sample.data[,1]- 0.5 * sample.data[,2]
w[runif(samplesize)>pnorm(w_u)]<-0
#save data
sampledata[re,,]<-sample.data #true data
samplew[re,]<-w
#data with missing records
misdata[re,,]<-sample.data
#sample panel data
panelindex<-sample(1:n,Np,replace=FALSE)
p.index[re,]<-panelindex
#refreshment sample index
r.index[re,]<-(1:n)[-panelindex]
#missing Y2 in attrited panel and Y1 in refreshment sample
misdata[re,panelindex[w[panelindex]==0],6:10]<-999
misdata[re,-panelindex,1:5]<-999
}



####----transfer sampledata and misdata array to be matrix----####
sample.mtx<-matrix(0,Re*n,11)
mis.mtx<-matrix(0,Re*n,11)

for (re in 1:Re){
  sample.mtx[((re-1)*n+1):(re*n),1]<-rep(re,n)
  sample.mtx[((re-1)*n+1):(re*n),2:11]<-sampledata[re,,]

  mis.mtx[((re-1)*n+1):(re*n),1]<-rep(re,n)
  mis.mtx[((re-1)*n+1):(re*n),2:11]<-misdata[re,,]  
}
        
#true sample data for each replication
write.table(sample.mtx,row.names = FALSE,col.names=FALSE, "data/sample-simulation.csv",sep = ",") 
#observed sample data for each replication
write.table(mis.mtx,row.names = FALSE,col.names=FALSE,"data/misdata-simulation.csv",sep = ",") 

#W for each replication
write.table(samplew,row.names = FALSE,col.names=FALSE, "data/samplew-simulation.csv",sep = ",")
#panel index for each replication
write.table(p.index,row.names = FALSE,col.names=FALSE, "data/panelindex-simulation.csv",sep = ",") 
#refreshment sample for each replication
write.table(r.index,row.names = FALSE,col.names=FALSE, "data/refindex-simulation.csv",sep = ",") 
#pool all generated data together
wholedata<-data.frame(sample.mtx[,2:11])
write.table(wholedata,row.names = FALSE,col.names=FALSE, "data/pop-simulation.csv",sep = ",")

names(wholedata)<-c("Y1","Y2","Y3","Y4","Y5","Y6","Y7","Y8","Y9","Y10")
#true values of quantities of interest
true_q<-c(apply(wholedata,2,mean),mean(wholedata[,1]==1&wholedata[,6]==1),
          mean(wholedata[,2]==1&wholedata[,7]==1),mean(wholedata[,3]==1&wholedata[,8]==1),mean(wholedata[,4]==1&wholedata[,9]==1),
          mean(wholedata[,5]==1&wholedata[,10]==1),mean(wholedata[,6]==1&wholedata[,10]==1),mean(wholedata[,7]==1&wholedata[,10]==1),
          mean(wholedata[,8]==1&wholedata[,10]==1),mean(wholedata[,9]==1&wholedata[,10]==1))
#----------------------------------------------------------------#
####--------------Amelia-------------------####
#----------------------------------------------------------------#
m<-20 # #of multiple imputations
####--------------output files-------------------####
#mean/variance estimates/CI intervals for MI
cover<-matrix(0,Re,length(true_q))
q_mi<-matrix(0,Re,length(true_q)); u_mi<-matrix(0,Re,length(true_q)); 
b_mi<-matrix(0,Re,length(true_q)); t_mi<-matrix(0,Re,length(true_q)); 
low_mi<-matrix(0,Re,length(true_q)); 
up_mi<-matrix(0,Re,length(true_q))

q_prob<-matrix(0,m,10); u_prob<-matrix(0,m,10);
q_prob_j<-matrix(0,m,5); u_prob_j<-matrix(0,m,5);
q_prob_j2<-matrix(0,m,4); u_prob_j2<-matrix(0,m,4);


####--------------MI via Amelia-------------------####
for (re in 1:Re){
  paneldata<-data.frame(misdata[re,p.index[re,],])
  paneldata[paneldata==999]<- NA
  names(paneldata)<-c("Y1","Y2","Y3","Y4","Y5","Y6","Y7","Y8","Y9","Y10")
  #main MI
  a.out<-amelia(paneldata,m=20,p2s=0,noms=c("Y1","Y2","Y3","Y4","Y5","Y6","Y7","Y8","Y9","Y10"))
  #inference
  for (l in 1:m){
    data_l<-a.out$imputations[[l]]
    q_prob[l,]<-apply(data_l==1,2,mean)
    u_prob[l,]<-apply(data_l==1,2,mean) * apply(data_l==0,2,mean)/Np
    for (j in 1:5){
    q_prob_j[l,j]<-mean(data_l[,j]==1 & data_l[,5+j]==1)
    u_prob_j[l,j]<-q_prob_j[l,j]*(1-q_prob_j[l,j])/Np
    }
    for (j in 1:4){
      q_prob_j2[l,j]<-mean(data_l[,j+5]==1 & data_l[,10]==1)
      u_prob_j2[l,j]<-q_prob_j2[l,j]*(1-q_prob_j2[l,j])/Np      
    }
  }
  q_mi[re,]<-c(apply(q_prob,2,mean),apply(q_prob_j,2,mean),apply(q_prob_j2,2,mean))
  u_mi[re,]<-c(apply(u_prob,2,mean),apply(u_prob_j,2,mean),apply(u_prob_j2,2,mean))
  b_mi[re,]<-c(apply(q_prob,2,var),apply(q_prob_j,2,var),apply(q_prob_j2,2,var))
  t_mi[re,]<-(1+1/m)*b_mi[re,]+u_mi[re,]
  df_mi<-(m-1)*((1+u_mi[re,]/b_mi[re,]/(1+1/m))^2)
  low_mi[re,]<-q_mi[re,]-qt(0.975,df_mi) * sqrt(t_mi[re,])
  up_mi[re,]<-q_mi[re,]+qt(0.975,df_mi) * sqrt(t_mi[re,])
  
  for (j in 1:length(true_q)){
    if (low_mi[re,j]<=true_q[j] & true_q[j]<=up_mi[re,j]){
      cover[re,j]<-1
    }
  }
}
apply(t_mi,2,mean) #variance
apply(cover,2,mean) #nominal coverage rates
apply(q_mi,2,mean)-true_q # bias
rmse0<-rep(0,length(true_q)) 
for (re in 1:Re){
rmse0<-rmse0+(q_mi[re,]-true_q)^2
}
rmse<-sqrt(rmse0/Re) #RMSE

save.image(file="simulation-amelia.RData")
